In this piece of paper, a set of data obtained from Annual Status of Education Report (ASER) is explored. The raw data was downloaded from the link here. https://palnetwork.org/aser-centre/
library(tidyverse)
library(ggplot2)
library(ggthemes)
library(ggrepel)
library(gghighlight)
library(stringr)
library(dplyr)
library(sf)
library(scatterplot3d)
library(car)
# library(broom)
require(ggiraph)
## Warning: package 'ggiraph' was built under R version 4.0.3
require(ggiraphExtra)
# require(plyr)
provdist <- read.csv("aser/ASER2016ProvDist.csv")
school <- read.csv("aser/ASER2016GSchool.csv")
child <- read.csv("aser/ASER2016Child.csv")
pschool <- read.csv("aser/ASER2016PvtSchool.csv")
gschool <- read.csv("aser/ASER2016GSchool.csv")
parent <- read.csv("aser/ASER2016Parent.csv")
house <- read.csv("aser/ASER2016HouseholdIndicators.csv")
RegionName <- c("2" = "Panjab",
"3" = "Sindh",
"4" = "Balochistan",
"5" = "Khyber Pakhtunkhwa",
"6" = "Gilgit-Baltistan",
"7" = "Azad Jammu and Kashmir",
"8" = "Islamabad - ICT",
"9" = "Federally Administrated Tribal Areas")
Gender <- c("0" = "Male",
"-1" = "Female")
length(unique(child$CID))
## [1] 255196
The whole samplesize (the numebr of children) of this dataset is 255196.
child %>%
filter(DID == 266) %>%
summarize(N_hunza = length(unique(CID)))
The samplesize of Hunza alone is 1641.
child %>%
filter(DID == 266) %>%
summarize(gender_proportion = mean(C002))
-1: female, 0: male
gender_proportion = -0.5173675 means there are a little more girls in the dataset.
child %>%
filter(DID == 266) %>%
ggplot(aes(C001)) +
geom_histogram()
Age is well sparsed
child %>%
filter(DID == 266) %>%
ggplot(aes(C003)) +
geom_histogram(bins = 3)
1 = never enrolled; 2 = drop-out; 3 = currently enrolled
child %>%
filter(DID == 266) %>%
ggplot(aes(C003)) +
geom_histogram(bins = 3, binwidth = 1) +
facet_grid(~C002, labeller = labeller(C002 = Gender))
Both genders look pretty good interms of the absolute number of currently-enrolled-children
child %>%
filter(DID == 266) %>%
group_by(C002) %>%
mutate(enrollment_rate = mean(C003 == 3)) %>%
ggplot(aes(C002, enrollment_rate)) +
geom_col() +
scale_y_continuous() +
geom_label(aes(label = enrollment_rate)) +
scale_x_continuous(breaks = c(-1, 0), labels = c("Female", "Male"))
As a rate, both are doing pretty good
1 = Government; 2 = Private; 3 = Madrasah(Conventional religious education) School; 4 = Other(Non formal education facility)
child %>%
filter(DID == 266) %>%
ggplot(aes(C006)) +
geom_histogram()
Most children go to public schools or private schools
child %>%
filter(RID == 6) %>%
group_by(DID) %>%
mutate(Current_Enrollment_Rate = mean(C003 == 3)) %>%
ggplot(aes(DID, Current_Enrollment_Rate)) +
geom_count() +
scale_x_continuous(breaks = 260:266, labels = c("Gilgit", "Diamer", "Skardu", "Ghanshe", "Astore", "Ghizer", "Hunza-Nagar"))
Within Gilgit-Baltistan, Hunza is outperforming.
1 = Begginer/Nothing; 2 = Letters; 3 = Words; 4 = Sentences; 5 = Story
child %>%
filter(DID == 266) %>%
ggplot(aes(C010)) +
geom_histogram()
child %>%
filter(DID == 266) %>%
summarize(na = sum(is.na(C010)))
child %>%
filter(DID == 266, C013 != c(3,4)) %>%
ggplot(aes(C010)) +
geom_histogram() +
facet_grid(~C001)
# children at he age of 3 and 4 are removed for they have not data
child %>%
filter(DID == 266) %>%
ggplot(aes(C010)) +
geom_histogram() +
facet_grid(~C002, labeller = labeller(C002 = Gender))
child %>%
filter(DID == 266, C013 != c(3,4)) %>%
ggplot(aes(C013)) +
geom_histogram() +
facet_grid(~C001)
# children at he age of 3 and 4 are removed for they have not data
child %>%
filter(DID == 266) %>%
ggplot(aes(C013)) +
geom_histogram() +
facet_grid(~C002, labeller = labeller(C002 = Gender))
child %>%
filter(DID == 266) %>%
group_by(C002) %>%
summarize(enrollment_rate = mean(C003 == 3, na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
child %>%
group_by(DID, C002) %>%
mutate(enrollment_rate = mean(C003 == 3, na.rm = TRUE)) %>%
ggplot(aes(DID, enrollment_rate, color = factor(RID), label = C002)) +
geom_point(show.legend = FALSE) +
geom_text(aes(label = C002), show.legend = FALSE, nudge_y = 0.05)
## Warning: Removed 66 rows containing missing values (geom_text).
child %>%
group_by(DID) %>%
mutate(avg = round(mean(C003 == 3), digits = 2)) %>%
ungroup() %>%
ggplot(aes(avg)) +
geom_histogram() +
facet_grid(~RID, labeller = labeller(RID = RegionName)) +
labs(title = "Current Enrollment Rate by Region")
child %>%
filter(C002 == -1) %>%
group_by(DID) %>%
mutate(avg_learning = mean(C010, na.rm = TRUE)) %>%
ggplot(aes(DID, avg_learning, color = RID)) +
geom_point() +
geom_text(aes(label = DID), nudge_x = 5, check_overlap = TRUE)
child %>%
filter(C002 == -1) %>%
group_by(DID) %>%
mutate(avg_learning = mean(C010, na.rm = TRUE)) %>%
ggplot(aes(DID, avg_learning, color = RID)) +
geom_point() +
geom_text(aes(label = DID), nudge_x = 5, check_overlap = TRUE) +
gghighlight(RID == 6)
It is interesting to note that Gilgit-Baltistan(RID==6) has a huge diversity in average learning levels of girls and Hunza(DID==266) is in the top group of all region.
ica_df <- ica %>%
mutate(centroid = st_centroid(geometry),
x = st_coordinates(centroid)[,1],
y = st_coordinates(centroid)[,2]) %>%
as.data.frame()
## Warning: Problem with `mutate()` input `centroid`.
## x st_centroid does not give correct centroids for longitude/latitude data
## i Input `centroid` is `st_centroid(geometry)`.
## Warning in st_centroid.sfc(geometry): st_centroid does not give correct
## centroids for longitude/latitude data
ica_df <- ica_df %>% select(Province, Districts, x, y)
ica_df <- ica_df %>% summarize(Province = tolower(Province), Districts = tolower(Districts), x = x, y = y)
child_dname <- child %>% left_join(provdist[-1])
## Joining, by = "DID"
child_dname <- child_dname %>% mutate(dname = tolower(DNAME))
ica_df_3 <- ica_df %>% filter(Province == "sindh")
ica_df_3$Districts <- ica_df_3$Districts %>%
str_replace("ghotki", "gotki") %>%
str_replace("mirpur khas", "mirpurkhas") %>%
str_replace("malir karachi", "karachi-malir-rural") %>%
str_replace("naushahro feroze", "nowshero feroze") %>%
str_replace("kambar shahdad kot", "qambar shahdadkot") %>%
str_replace("sujawal", "sajawal") %>%
str_replace("shaheed benazir abad", "shaheed benazirabad") %>%
str_replace("tando allahyar", "tando allah yar") %>%
as.vector()
child_dname_3 <- child_dname %>% filter(RNAME == "Sindh") %>% left_join(ica_df_3, by = c("dname" = "Districts"))
child_dname_3 %>% group_by(dname) %>% summarize(n = sum(x))
## `summarise()` ungrouping output (override with `.groups` argument)
ica_df_3
ica %>%
mutate(centroid = st_centroid(geometry),
x = st_coordinates(centroid)[,1],
y = st_coordinates(centroid)[,2]) %>%
ggplot() +
geom_sf() +
geom_point(data = child_ica, aes(x, y, label = DID, color = factor(RID))) +
geom_text(data = child_ica, aes(x, y, label = DID), size = 5, nudge_y = 0.2, check_overlap = TRUE)
## Warning: Problem with `mutate()` input `centroid`.
## x st_centroid does not give correct centroids for longitude/latitude data
## i Input `centroid` is `st_centroid(geometry)`.
## Warning in st_centroid.sfc(geometry): st_centroid does not give correct
## centroids for longitude/latitude data
## Warning: Ignoring unknown aesthetics: label
## Warning: Removed 8541 rows containing missing values (geom_point).
## Warning: Removed 8541 rows containing missing values (geom_text).
ica %>%
mutate(centroid = st_centroid(geometry),
x = st_coordinates(centroid)[,1],
y = st_coordinates(centroid)[,2]) %>%
ggplot() +
geom_sf() +
geom_point(data = child_ica, aes(x, y, label = DNAME, shape = factor(RID), color = factor(RID))) +
geom_text(data = child_ica, aes(x, y, label = DNAME), size = 4.5, nudge_y = 0.2, check_overlap = TRUE) +
scale_shape_manual(values = 0:8) +
coord_sf(xlim = c(71, 77.9), ylim = c(34, 37.1))
## Warning: Problem with `mutate()` input `centroid`.
## x st_centroid does not give correct centroids for longitude/latitude data
## i Input `centroid` is `st_centroid(geometry)`.
## Warning in st_centroid.sfc(geometry): st_centroid does not give correct
## centroids for longitude/latitude data
## Warning: Ignoring unknown aesthetics: label
## Warning: Removed 8541 rows containing missing values (geom_point).
## Warning: Removed 8541 rows containing missing values (geom_text).
child_ica %>%
filter(RID == 6) %>%
summarise(xmin = min(x), xmax = max(x), ymin = min(y), ymax = max(y))
ica %>%
mutate(centroid = st_centroid(geometry),
x = st_coordinates(centroid)[,1],
y = st_coordinates(centroid)[,2]) %>%
ggplot() +
geom_sf() +
geom_point(data = child_ica %>% group_by(DID) %>%
mutate(avg_enroll = mean(C003 == 3)),
aes(x, y, label = avg_enroll, color = avg_enroll))
# geom_text(data = child_ica%>% group_by(DID) %>%
# mutate(avg_enroll = mean(C003 == 3)),
# aes(x, y, avg_enroll), check_overlap = TRUE, nudge_y = 0.5)
ica %>%
mutate(centroid = st_centroid(geometry),
x = st_coordinates(centroid)[,1],
y = st_coordinates(centroid)[,2]) %>%
ggplot() +
geom_sf() +
geom_point(data = child_ica %>% filter(C002 == -1) %>% group_by(DID) %>%
mutate(avg_enroll = mean(C003 == 3)),
aes(x, y, label = avg_enroll, color = avg_enroll))
## Warning: Problem with `mutate()` input `centroid`.
## x st_centroid does not give correct centroids for longitude/latitude data
## i Input `centroid` is `st_centroid(geometry)`.
## Warning in st_centroid.sfc(geometry): st_centroid does not give correct
## centroids for longitude/latitude data
## Warning: Ignoring unknown aesthetics: label
## Warning: Removed 3566 rows containing missing values (geom_point).
ica %>%
mutate(centroid = st_centroid(geometry),
x = st_coordinates(centroid)[,1],
y = st_coordinates(centroid)[,2]) %>%
ggplot() +
geom_sf() +
geom_point(data = child_ica, aes(x, y, label = C010, color = C010))
# geom_text(data = child_ica, aes(x, y, label = C010), check_overlap = TRUE, nudge_y = 1)
ica %>%
mutate(centroid = st_centroid(geometry),
x = st_coordinates(centroid)[,1],
y = st_coordinates(centroid)[,2]) %>%
ggplot() +
geom_sf() +
geom_point(data = child_ica %>%
group_by(DID) %>%
mutate(gender_ratio = mean(C002)),
aes(x, y, label = gender_ratio, color = gender_ratio))
0: male, -1: female
Thus, -0.5 indicates the complete gender parity
ica %>%
mutate(centroid = st_centroid(geometry),
x = st_coordinates(centroid)[,1],
y = st_coordinates(centroid)[,2]) %>%
ggplot() +
geom_sf() +
geom_point(data = child_ica %>% group_by(DID) %>%
mutate(gender_ratio = mean(C002, na.rm = TRUE)), aes(x, y, label = gender_ratio, color = gender_ratio, size = -gender_ratio))
ica %>%
mutate(centroid = st_centroid(geometry),
x = st_coordinates(centroid)[,1],
y = st_coordinates(centroid)[,2]) %>%
ggplot() +
geom_sf() +
geom_point(data = (child_ica %>% filter(C001 == 14:16) %>% group_by(DID) %>%
mutate(avg_learning_level = mean(C010, na.rm = TRUE))), aes(x, y, label = avg_learning_level, color = avg_learning_level, size = avg_learning_level))
ica %>%
mutate(centroid = st_centroid(geometry),
x = st_coordinates(centroid)[,1],
y = st_coordinates(centroid)[,2]) %>%
ggplot() +
geom_sf() +
geom_point(data = (child_ica %>% filter(C001 == 12:16) %>% group_by(DID) %>%
mutate(gender_ratio = mean(C002, na.rm = TRUE), avg_learning_level = mean(C010, na.rm = TRUE))), aes(x, y, label = gender_ratio, color = gender_ratio, size = avg_learning_level)) +
theme(axis.title = element_text())
child_ica %>%
filter(C002 == c(0,-1)) %>%
group_by(DID, C001, C002) %>%
mutate(avg_local = mean(C010, na.rm = TRUE)) %>%
ggplot(aes(C001, avg_local, color = factor(C002), group = DID)) +
geom_line(show.legend = FALSE) +
labs(x = "Age") +
facet_grid(RID~C002, labeller = labeller(RID = RegionName, C002 = Gender)) +
facet_wrap(RID~C002, labeller = labeller(RID = RegionName, C002 = Gender))
child_ica %>%
filter(C002 == c(0,-1)) %>%
group_by(DID, C001, C002) %>%
mutate(avg_enrollment = mean(C003 == 3, na.rm = TRUE)) %>%
ggplot(aes(C001, avg_enrollment, color = factor(DID), group = DID)) +
geom_line(show.legend = FALSE) +
labs(x = "Age") +
facet_grid(RID~C002, labeller = labeller(RID = RegionName, C002 = Gender)) +
facet_wrap(RID~C002, labeller = labeller(RID = RegionName, C002 = Gender))
child_ica %>%
filter(C002 != "NA", RID == 6, RID != "NA") %>%
group_by(DID, C001, C002) %>%
mutate(avg_enrollment = mean(C003 == 3, na.rm = TRUE)) %>%
ggplot(aes(C001, avg_enrollment, color = factor(DID), group = DID)) +
geom_line() +
labs(x = "Age") +
facet_grid(RID~C002, labeller = labeller(RID = RegionName, C002 = Gender)) +
facet_wrap(RID~C002, labeller = labeller(RID = RegionName, C002 = Gender)) +
gghighlight(label_key = DNAME, calculate_per_facet = TRUE)
child_ica %>%
filter(C002 != "NA", RID == 6, RID != "NA") %>%
group_by(DID, C001, C002) %>%
mutate(avg_enrollment = mean(C003 == 3, na.rm = TRUE)) %>%
ggplot(aes(C001, avg_enrollment, color = factor(DID), group = DID)) +
geom_line() +
labs(x = "Age") +
facet_grid(RID~C002, labeller = labeller(RID = RegionName, C002 = Gender)) +
facet_wrap(RID~C002, labeller = labeller(RID = RegionName, C002 = Gender)) +
gghighlight(DID == 266,label_key = DNAME, calculate_per_facet = TRUE)
## Warning: Tried to calculate with group_by(), but the calculation failed.
## Falling back to ungrouped filter operation...
child_ica %>%
filter(C002 != "NA", RID == 3) %>%
group_by(DID, C001, C002) %>%
mutate(avg_enrollment = mean(C003 == 3, na.rm = TRUE)) %>%
ggplot(aes(C001, avg_enrollment, color = factor(DID), group = DID)) +
geom_line() +
labs(x = "Age") +
facet_grid(RID~C002, labeller = labeller(RID = RegionName, C002 = Gender)) +
facet_wrap(RID~C002, labeller = labeller(RID = RegionName, C002 = Gender)) +
gghighlight(DID == c(315,316), label_key = DNAME, calculate_per_facet = TRUE)
child_ica %>%
filter(RID == 6, PR004 != "NA") %>%
group_by(DID, PR004) %>%
mutate(avg_enrollment = mean(C003 == 3, na.rm = TRUE)) %>%
ggplot(aes(DID, avg_enrollment, fill = factor(PR004))) +
geom_col(aes(factor(DID), avg_enrollment, fill = factor(PR004)), position = "dodge")
Within Gilgit-Baltistan, Hunza is the only district where there are more women who have experience of education than those who have never enrolled in schools.
child_ica %>%
filter(RID == 6, PR009 != "NA") %>%
group_by(DID, PR009) %>%
mutate(avg_enrollment = mean(C003 == 3, na.rm = TRUE)) %>%
ggplot(aes(DID, avg_enrollment, group = PR009, fill = factor(PR009))) +
geom_col(aes(factor(DID), avg_enrollment), position = "dodge")
child_ica %>%
filter(PR009 != "NA", PR004 != "NA") %>%
group_by(DID) %>%
mutate(avg_learning = mean(C003 == 3)) %>%
ggplot(aes(PR004, avg_learning, group = PR004)) +
geom_boxplot(aes(PR004, avg_learning)) +
facet_grid(.~RID)
child_ica %>%
filter(PR009 != "NA", PR004 != "NA") %>%
group_by(DID) %>%
mutate(avg_learning = mean(C003 == 3)) %>%
ggplot(aes(PR009, avg_learning, group = PR009)) +
geom_boxplot(aes(PR009, avg_learning)) +
facet_grid(.~RID)
child_ica %>%
filter(PR009 != "NA", PR004 != "NA") %>%
group_by(DID) %>%
mutate(avg_learning = mean(C003 == 3, na.rm = TRUE), mother_enrollment = mean(PR004, na.rm = TRUE)) %>%
ggplot(aes(mother_enrollment, avg_learning)) +
geom_point()
child_ica %>%
filter(PR009 != "NA", PR004 != "NA") %>%
group_by(DID) %>%
mutate(avg_learning = mean(C003 == 3, na.rm = TRUE), mother_enrollment = mean(PR004, na.rm = TRUE)) %>%
ungroup() %>%
summarize(r = cor(mother_enrollment, avg_learning))
child_ica %>%
filter(PR009 != "NA", PR004 != "NA") %>%
group_by(DID) %>%
mutate(avg_learning = mean(C003 == 3, na.rm = TRUE), father_enrollment = mean(PR009, na.rm = TRUE)) %>%
ggplot(aes(father_enrollment, avg_learning)) +
geom_point()
child_ica %>%
filter(PR009 != "NA", PR004 != "NA") %>%
group_by(DID) %>%
mutate(avg_learning = mean(C003 == 3), father_enrollment = mean(PR009)) %>%
ungroup() %>%
summarize(r = cor(father_enrollment, avg_learning))
child_ica %>%
filter(PR009 != "NA", PR004 != "NA", C003 != "NA", DID == 266, C001 != "NA") %>%
group_by(PR004, C001) %>%
mutate(avg_learning = mean(C010, na.rm = TRUE)) %>%
ggplot(aes(C001, avg_learning, group = PR004, color = factor(PR004))) +
geom_line(aes(C001, avg_learning))
## Warning: Removed 200 row(s) containing missing values (geom_path).
child_ica %>%
filter(PR009 != "NA", PR004 != "NA", C003 != "NA", DID == 266, C001 != "NA") %>%
group_by(PR009, C001) %>%
mutate(avg_learning = mean(C010, na.rm = TRUE)) %>%
ggplot(aes(C001, avg_learning, group = PR009, color = factor(PR009))) +
geom_line(aes(C001, avg_learning))
## Warning: Removed 200 row(s) containing missing values (geom_path).
child_ica %>%
filter(PR009 != "NA",
PR004 != "NA",
C003 != "NA",
C001 != "NA",
RID == 6) %>%
group_by(DID, PR004, C001) %>%
mutate(avg_learning = mean(C010, na.rm = TRUE)) %>%
ggplot(aes(C001, avg_learning, group = PR004, color = factor(PR004))) +
geom_line(aes(C001, avg_learning)) +
facet_grid(.~DID) +
facet_wrap(.~DID)
## Warning: Removed 226 row(s) containing missing values (geom_path).
child_ica %>%
filter(PR009 != "NA",
PR004 != "NA",
C003 != "NA",
C001 != "NA",
RID == 6) %>%
group_by(DID, PR009, C001) %>%
mutate(avg_learning = mean(C010, na.rm = TRUE)) %>%
ggplot(aes(C001, avg_learning, group = PR009, color = factor(PR009))) +
geom_line(aes(C001, avg_learning)) +
facet_grid(.~DID) +
facet_wrap(.~DID)
## Warning: Removed 226 row(s) containing missing values (geom_path).
child_ica %>%
filter(PR009 != "NA",
PR004 != "NA",
C003 != "NA",
C001 != "NA",
RID == 6) %>%
group_by(DID, PR004, C001) %>%
mutate(avg_enrollment = mean(C003 == 3, na.rm = TRUE)) %>%
ggplot(aes(C001, avg_enrollment, group = PR004, color = factor(PR004))) +
geom_line(aes(C001, avg_enrollment)) +
facet_grid(.~DID) +
facet_wrap(.~DID)
child_ica %>%
filter(PR009 != "NA",
PR004 != "NA",
C003 != "NA",
C001 != "NA",
RID == 6) %>%
group_by(DID, PR009, C001) %>%
mutate(avg_enrollment = mean(C003 == 3, na.rm = TRUE)) %>%
ggplot(aes(C001, avg_enrollment, group = PR009, color = factor(PR009))) +
geom_line(aes(C001, avg_enrollment)) +
facet_grid(.~DID) +
facet_wrap(.~DID)
child_ica %>%
group_by(DID, PR006) %>%
ggplot(aes(PR006)) +
geom_histogram(aes(PR006), stat = "count") +
theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 5)) +
facet_grid(.~RID)
## Warning: Ignoring unknown parameters: binwidth, bins, pad
child_ica %>% left_join(house, by = "HHID") %>%
filter(DID.x == 266) %>%
ggplot(aes(H002.x)) +
geom_histogram(aes(H002.x), stat = "count")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
child_ica %>%
ggplot(aes(x = factor(DID), y = factor(H002), fill = factor(H002))) +
geom_bar(aes(x = factor(DID), y = factor(H002)), stat = "identity", position = "fill") +
facet_grid(.~RID, scales = "free") +
facet_wrap(.~RID, scales = "free") +
theme(axis.text.x = element_text(angle = 90, size = 7, hjust = 1)) +
coord_cartesian(ylim = c(0,1), clip = "off", expand = FALSE)
child_ica %>%
filter(C003 != "NA", RID == 6) %>%
group_by(DID, H002) %>%
mutate(avg_enrollment = mean(C003 == 3)) %>%
ggplot(aes(factor(DID), avg_enrollment, group = H002, fill = factor(H002))) +
geom_col(position = "dodge")
child_ica %>%
filter(PR004 != "NA", PR009 != "NA", C010 != "NA") %>%
group_by(DID) %>%
mutate(mother_edu_rate = sum(PR004 == -1)/length(PR004)) %>%
ggplot(aes(DID, mother_edu_rate, shape = factor(RID), color = factor(RID))) +
geom_point() +
scale_shape_manual(values = 0:8)
child_ica %>%
filter(PR004 != "NA", PR009 != "NA", C010 != "NA") %>%
group_by(DID) %>%
mutate(mother_edu_rate = sum(PR004 == -1)/length(PR004)) %>% ungroup() %>%
ggplot(aes(factor(RID), mother_edu_rate, group = RID)) +
geom_violin()
child_ica %>%
filter(PR004 != "NA", PR009 != "NA", C010 != "NA", C002 != "NA") %>%
ggplot(aes(PR005)) +
geom_histogram(stat = "count") +
coord_flip()
## Warning: Ignoring unknown parameters: binwidth, bins, pad
child_ica %>%
ggplot(aes(PR006)) +
geom_histogram(stat = "count") +
coord_flip()
## Warning: Ignoring unknown parameters: binwidth, bins, pad
child_ica %>%
filter(PR004 != "NA", PR009 != "NA", C010 != "NA", C002 != "NA") %>%
ggplot(aes(PR010)) +
geom_histogram(stat = "count") +
coord_flip()
## Warning: Ignoring unknown parameters: binwidth, bins, pad
child_ica %>%
ggplot(aes(PR011)) +
geom_histogram(stat = "count") +
coord_flip()
## Warning: Ignoring unknown parameters: binwidth, bins, pad
child_ica %>%
filter(PR004 != "NA", PR009 != "NA", C010 != "NA", C002 != "NA") %>%
group_by(C005) %>%
summarize(n = n()) %>%
mutate(C005 = fct_reorder(C005, n)) %>%
ggplot(aes(C005, n)) +
geom_col() +
coord_flip()
## `summarise()` ungrouping output (override with `.groups` argument)
child_ica %>%
filter(PR004 != "NA", PR009 != "NA", !is.na(C010), C002 != "NA", !is.na(C005), DID == 266) %>%
group_by(C005) %>%
summarize(n = n()) %>%
mutate(C005 = fct_reorder(C005, n)) %>%
ggplot(aes(C005, n)) +
geom_col() +
coord_flip()
## `summarise()` ungrouping output (override with `.groups` argument)
child_ica_dummy <- child_ica
child_ica_dummy$Panjab <- ifelse(child_ica_dummy$RID == 2, 1, 0)
child_ica_dummy$Sindh <- ifelse(child_ica_dummy$RID == 3, 1, 0)
child_ica_dummy$Balochistan <- ifelse(child_ica_dummy$RID == 4, 1, 0)
child_ica_dummy$Khyber_Pakhtunkhwa <- ifelse(child_ica_dummy$RID == 5, 1, 0)
child_ica_dummy$Gilgit_Baltistan <- ifelse(child_ica_dummy$RID == 6, 1, 0)
child_ica_dummy$Azad_Jammu_and_Kashmir <- ifelse(child_ica_dummy$RID == 7, 1, 0)
child_ica_dummy$Islamabad_ICT <- ifelse(child_ica_dummy$RID == 8, 1, 0)
child_ica_dummy$Federally_Administrated_Tribal_Areas <- ifelse(child_ica_dummy$RID == 9, 1, 0)
child_ica %>%
filter(!is.na(PR004), !is.na(PR009)) %>%
filter(HHID == 96546) %>%
select(CID, PRID, C002, C001, PR001, VID, C003, DID, RID)
child_ica %>%
group_by(HHID) %>%
mutate(n = n()) %>%
ggplot(aes(factor(n), fill = factor(RID))) +
geom_bar(position = "dodge") +
facet_grid(.~RID) +
facet_wrap(.~RID)
child_ica %>%
filter(!is.na(C002), RID == 6) %>%
group_by(DID, C001, C002) %>%
mutate(enroll = mean(C003 == 3)) %>%
ggplot(aes(C001, enroll, group = C002, color = factor(C002))) +
geom_line() +
facet_wrap(.~DID)
child_ica_dummy <- child_ica_dummy %>% filter(!is.na(C002), !is.na(C003), !is.na(PR004), !is.na(PR009))
child_ica_dummy$C002_01 <- ifelse(child_ica_dummy$C002 == -1, 1, 0)
child_ica_dummy$C003_01 <- ifelse(child_ica_dummy$C003 == 3, 1, 0)
child_ica_dummy$PR004_01 <- ifelse(child_ica_dummy$PR004 == -1, 1, 0)
child_ica_dummy$PR009_01 <- ifelse(child_ica_dummy$PR009 == -1, 1, 0)
child_ica_dummy$PR004_PR009_01 <- ifelse(child_ica_dummy$PR004 == -1 | child_ica_dummy$PR009 == -1, 1, 0)
n_children_in_household <- child_ica_dummy %>%
group_by(HHID) %>%
summarize(n_children_in_household = length(unique(CID)))
## `summarise()` ungrouping output (override with `.groups` argument)
child_ica_dummy <- child_ica_dummy %>% left_join(n_children_in_household)
## Joining, by = "HHID"
child_ica %>%
summarize(C002_na = sum(is.na(C002)),
C003_na = sum(is.na(C003)),
PR004_na = sum(is.na(PR004)),
PR009_na = sum(is.na(PR009)))
data.frame(original_rows = nrow(child_ica),
eliminated_rows = nrow(child_ica) - nrow(child_ica_dummy),
ratio = (nrow(child_ica)-nrow(child_ica_dummy))/nrow(child_ica))
child_ica %>%
filter(DID == 266) %>%
summarize(C002_na = sum(is.na(C002)),
C003_na = sum(is.na(C003)),
PR004_na = sum(is.na(PR004)),
PR009_na = sum(is.na(PR009)))
data.frame(original_rows = nrow(child_ica %>% filter(DID == 266)),
eliminated_rows = nrow(child_ica %>% filter(DID == 266)) - nrow(child_ica_dummy %>% filter(DID == 266)),
ratio = (nrow(child_ica %>% filter(DID == 266) %>% filter(DID == 266))-nrow(child_ica_dummy %>% filter(DID == 266)))/nrow(child_ica %>% filter(DID == 266)))
glm_child <- glm(C003_01 ~ n_children_in_household + PR004_PR009_01 + C002_01, family = "binomial", data = child_ica_dummy)
ggPredict(glm_child, se = TRUE, colorAsFactor = TRUE, show.summary = TRUE, point = TRUE)
##
## Call:
## glm(formula = C003_01 ~ n_children_in_household + PR004_PR009_01 +
## C002_01, family = "binomial", data = child_ica_dummy)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8850 -1.2383 0.6563 0.8621 1.3071
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.934846 0.013780 67.84 <2e-16 ***
## n_children_in_household -0.055206 0.002935 -18.81 <2e-16 ***
## PR004_PR009_01 0.711645 0.009062 78.53 <2e-16 ***
## C002_01 -0.572083 0.009041 -63.28 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 299726 on 245746 degrees of freedom
## Residual deviance: 289072 on 245743 degrees of freedom
## AIC: 289080
##
## Number of Fisher Scoring iterations: 4
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
# glm_child_hunza <- glm(C003_01 ~ C001 + C002_01 + PR004_01, family = "binomial", data = child_ica_dummy %>% filter(DID == 266))
glm_child_hunza <- glm(C003_01 ~ n_children_in_household + PR004_PR009_01 + C002_01, family = "binomial", data = child_ica_dummy %>% filter(DID == 266))
ggPredict(glm_child_hunza, se = TRUE, show.summary = TRUE, point = TRUE, colorAsFactor = TRUE)
##
## Call:
## glm(formula = C003_01 ~ n_children_in_household + PR004_PR009_01 +
## C002_01, family = "binomial", data = child_ica_dummy %>%
## filter(DID == 266))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5255 0.3335 0.3611 0.4126 0.6056
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.74787 0.35406 7.761 8.43e-15 ***
## n_children_in_household -0.16354 0.07071 -2.313 0.0207 *
## PR004_PR009_01 0.44052 0.21451 2.054 0.0400 *
## C002_01 0.12227 0.19347 0.632 0.5274
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 838.79 on 1609 degrees of freedom
## Residual deviance: 826.06 on 1606 degrees of freedom
## AIC: 834.06
##
## Number of Fisher Scoring iterations: 5
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
ggPredict(glm_child_hunza, se = TRUE, colorAsFactor = TRUE, show.summary = TRUE, point = TRUE) +
coord_cartesian(xlim = c(3,5), ylim = c(0.55, 0.9))
##
## Call:
## glm(formula = C003_01 ~ n_children_in_household + PR004_PR009_01 +
## C002_01, family = "binomial", data = child_ica_dummy %>%
## filter(DID == 266))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5255 0.3335 0.3611 0.4126 0.6056
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.74787 0.35406 7.761 8.43e-15 ***
## n_children_in_household -0.16354 0.07071 -2.313 0.0207 *
## PR004_PR009_01 0.44052 0.21451 2.054 0.0400 *
## C002_01 0.12227 0.19347 0.632 0.5274
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 838.79 on 1609 degrees of freedom
## Residual deviance: 826.06 on 1606 degrees of freedom
## AIC: 834.06
##
## Number of Fisher Scoring iterations: 5
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
exp(glm_child_hunza$coefficients)
## (Intercept) n_children_in_household PR004_PR009_01
## 15.6093953 0.8491308 1.5535098
## C002_01
## 1.1300567
confint(glm_child_hunza, level = 0.95)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 2.0690843 3.45821705
## n_children_in_household -0.3018419 -0.02433219
## PR004_PR009_01 0.0115443 0.85445608
## C002_01 -0.2575389 0.50262655
exp(confint(glm_child_hunza, level = 0.95))
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 7.9175694 31.7602989
## n_children_in_household 0.7394550 0.9759614
## PR004_PR009_01 1.0116112 2.3500958
## C002_01 0.7729515 1.6530574
extractAIC(glm_child_hunza)
## [1] 4.0000 834.0574
extractAIC(glm_child_hunza, k = log(nrow(glm_child_hunza$data)))
## [1] 4.0000 855.5934
glm_child_hunza_null <- glm(C003_01~1, family = "binomial", data = child_ica_dummy %>% filter(DID == 266))
anova <- anova(glm_child_hunza_null, glm_child_hunza, test = "Chisq")
anova
step(glm_child_hunza_null, direction = "both",
scope = (~ C001 + C002_01 + PR004_01 + PR009_01))
## Start: AIC=840.79
## C003_01 ~ 1
##
## Df Deviance AIC
## + C001 1 732.49 736.49
## + PR004_01 1 824.25 828.25
## + PR009_01 1 830.82 834.82
## <none> 838.79 840.79
## + C002_01 1 838.58 842.58
##
## Step: AIC=736.49
## C003_01 ~ C001
##
## Df Deviance AIC
## + PR004_01 1 707.11 713.11
## + PR009_01 1 717.26 723.26
## <none> 732.49 736.49
## + C002_01 1 732.49 738.49
## - C001 1 838.79 840.79
##
## Step: AIC=713.11
## C003_01 ~ C001 + PR004_01
##
## Df Deviance AIC
## <none> 707.11 713.11
## + PR009_01 1 705.74 713.74
## + C002_01 1 707.11 715.11
## - PR004_01 1 732.49 736.49
## - C001 1 824.25 828.25
##
## Call: glm(formula = C003_01 ~ C001 + PR004_01, family = "binomial",
## data = child_ica_dummy %>% filter(DID == 266))
##
## Coefficients:
## (Intercept) C001 PR004_01
## -0.4789 0.3039 1.0316
##
## Degrees of Freedom: 1609 Total (i.e. Null); 1607 Residual
## Null Deviance: 838.8
## Residual Deviance: 707.1 AIC: 713.1
# step(glm_child_null, direction = "both",
# scope = (~ C001 + C002 + PR004_01 + PR009_01 +
# Panjab + Sindh + Balochistan + Khyber_Pakhtunkhwa + Gilgit_Baltistan +
# Azad_Jammu_and_Kashmir + Islamabad_ICT))
vif(glm_child_hunza)
## n_children_in_household PR004_PR009_01 C002_01
## 1.087196 1.080729 1.006453
glm_child_hunza <- glm(C003_01 ~ C001 + C002_01 + PR009_01, family = "binomial", data = child_ica_dummy %>% filter(DID == 266))
ggPredict(glm_child_hunza, se = TRUE, colorAsFactor = TRUE, show.summary = TRUE, point = TRUE)
##
## Call:
## glm(formula = C003_01 ~ C001 + C002_01 + PR009_01, family = "binomial",
## data = child_ica_dummy %>% filter(DID == 266))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.2344 0.1622 0.2536 0.4405 0.9815
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.417574 0.296711 -1.407 0.159
## C001 0.300175 0.032679 9.186 < 2e-16 ***
## C002_01 -0.002948 0.201334 -0.015 0.988
## PR009_01 0.839955 0.210274 3.995 6.48e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 838.79 on 1609 degrees of freedom
## Residual deviance: 717.26 on 1606 degrees of freedom
## AIC: 725.26
##
## Number of Fisher Scoring iterations: 6
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
ggPredict(glm_child_hunza, se = TRUE, colorAsFactor = TRUE, show.summary = TRUE, point = TRUE) +
coord_cartesian(xlim = c(3,5), ylim = c(0.55, 0.9))
##
## Call:
## glm(formula = C003_01 ~ C001 + C002_01 + PR009_01, family = "binomial",
## data = child_ica_dummy %>% filter(DID == 266))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.2344 0.1622 0.2536 0.4405 0.9815
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.417574 0.296711 -1.407 0.159
## C001 0.300175 0.032679 9.186 < 2e-16 ***
## C002_01 -0.002948 0.201334 -0.015 0.988
## PR009_01 0.839955 0.210274 3.995 6.48e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 838.79 on 1609 degrees of freedom
## Residual deviance: 717.26 on 1606 degrees of freedom
## AIC: 725.26
##
## Number of Fisher Scoring iterations: 6
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
glm_child_hunza <- glm(C003_01 ~ C001 + C002_01 + PR004_01, family = "binomial", data = child_ica_dummy %>% filter(DID == 266))
ggPredict(glm_child_hunza, se = TRUE, colorAsFactor = TRUE, show.summary = TRUE, point = TRUE)
##
## Call:
## glm(formula = C003_01 ~ C001 + C002_01 + PR004_01, family = "binomial",
## data = child_ica_dummy %>% filter(DID == 266))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.2906 0.1572 0.2702 0.4206 1.0017
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.48302 0.28454 -1.698 0.0896 .
## C001 0.30382 0.03258 9.326 < 2e-16 ***
## C002_01 0.00938 0.20272 0.046 0.9631
## PR004_01 1.03155 0.20425 5.051 4.41e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 838.79 on 1609 degrees of freedom
## Residual deviance: 707.11 on 1606 degrees of freedom
## AIC: 715.11
##
## Number of Fisher Scoring iterations: 6
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
child_ica_dummy %>%
filter(DID == 266) %>%
group_by(HHID) %>%
summarize(avg = length(unique(PRID)))
## `summarise()` ungrouping output (override with `.groups` argument)
child_ica_dummy %>%
filter(DID == 266) %>%
group_by(CID) %>%
summarize(avg = (PR004_01 + PR009_01)/2) %>%
ggplot(aes(factor(avg))) +
geom_histogram(stat = "count")
## `summarise()` ungrouping output (override with `.groups` argument)
## Warning: Ignoring unknown parameters: binwidth, bins, pad
child_ica_dummy %>% filter(RID == 6) %>% summarize(unique(DID))
sapply(260:266, function(dist){
glm_child_dist <- glm(C003_01 ~ C002_01, family = "binomial", data = child_ica_dummy %>% filter(RID == 6, DID == dist))
glm_child_dist_null <- glm(C003_01~1, family = "binomial", data = child_ica_dummy %>% filter(RID == 6, DID == dist))
anova <- anova(glm_child_dist_null, glm_child_dist, test = "Chisq")
data.frame(result = anova$`Pr(>Chi)`[2], DID = dist, chi_0.05 = anova$`Pr(>Chi)`[2] >= 0.05)
})
## [,1] [,2] [,3] [,4] [,5] [,6]
## result 0.02323511 1.711351e-125 0.02518717 0.04226537 0.1507538 0.7348869
## DID 260 261 262 263 264 265
## chi_0.05 FALSE FALSE FALSE FALSE TRUE TRUE
## [,7]
## result 0.6463022
## DID 266
## chi_0.05 TRUE
id <- child_ica_dummy %>% summarize(unique(DID))
id <- as.matrix(id)
id <- as.numeric(id[,1])
dist_logist <- sapply(id, function(dist){
glm_child_dist <- glm(C003_01 ~ C002_01,
family = "binomial",
data = child_ica_dummy %>%
filter(DID == dist))
glm_child_dist_null <- glm(C003_01~1,
family = "binomial",
data = child_ica_dummy %>%
filter(DID == dist))
anova <- anova(glm_child_dist_null, glm_child_dist, test = "Chisq")
data.frame(DID = dist,
chi = anova$`Pr(>Chi)`[2],
chi_largerThan_0.05 = anova$`Pr(>Chi)`[2] >= 0.05,
chi_largerThan_0.1 = anova$`Pr(>Chi)`[2] >= 0.1)
})
anova_dist <- as.data.frame(as.tibble(t(dist_logist)))
## Warning: `as.tibble()` is deprecated as of tibble 2.0.0.
## Please use `as_tibble()` instead.
## The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
anova_dist$DID <- as.numeric(anova_dist$DID)
anova_dist$chi <- as.numeric(anova_dist$chi)
anova_dist$chi_largerThan_0.1 <- as.logical(anova_dist$chi_largerThan_0.1)
anova_dist$chi_largerThan_0.05 <- as.logical(anova_dist$chi_largerThan_0.05)
anova_dist %>%
summarize(total_chi_largerThan_0.05 = sum(as.numeric(chi_largerThan_0.05)),
ratio_0.05 = sum(as.numeric(chi_largerThan_0.05))/length(chi_largerThan_0.05),
total_chi_largerThan_0.1 = sum(as.numeric(chi_largerThan_0.1)),
ratio_0.1 = sum(as.numeric(chi_largerThan_0.1))/length(chi_largerThan_0.1))
anova_dist %>%
filter(chi_largerThan_0.05 == TRUE) %>%
select(-chi_largerThan_0.1)
DID_chi_0.05 <- anova_dist %>%
filter(chi_largerThan_0.05 == TRUE) %>%
select(DID) %>%
pull()
DID_chi_0.05
## [1] 146 151 156 158 159 162 163 164 167 171 173 176 178 179 189 196 245 257 264
## [20] 265 266 267 268 269 270 271 272 273 274 276
anova_dist %>%
filter(chi_largerThan_0.1 == TRUE) %>%
select(-chi_largerThan_0.05)
DID_chi_0.1 <- anova_dist %>%
filter(chi_largerThan_0.1 == TRUE) %>%
select(DID) %>%
pull()
child %>%
filter(C002 == -1) %>%
group_by(DID) %>%
mutate(avg_learning = mean(C010, na.rm = TRUE)) %>%
ggplot(aes(DID, avg_learning)) +
geom_point(data = child %>%
filter(C002 == -1 & DID == DID_chi_0.05) %>%
group_by(DID) %>%
mutate(avg_learning = mean(C010, na.rm = TRUE)), size = 4, color = "red") +
geom_point(aes(color = RID)) +
geom_text(aes(label = DID), nudge_x = 5, check_overlap = TRUE)
## Warning in DID == DID_chi_0.05: 長いオブジェクトの長さが短いオブジェクトの長さの
## 倍数になっていません
child %>%
filter(DID == DID_chi_0.1) %>%
group_by(DID, C001) %>%
mutate(enroll = mean(C003 == 3)) %>%
ggplot(aes(C001, enroll)) +
geom_point() +
geom_smooth() +
facet_wrap(DID~.)
## Warning in DID == DID_chi_0.1: 長いオブジェクトの長さが短いオブジェクトの長さの
## 倍数になっていません
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
child_ica_dummy %>%
group_by(DID) %>%
mutate(n = length(C003),
dropout = sum(C003 == 2),
dropout_ratio = mean(C003 == 2),
never = sum(C003 == 1),
never_ratio = mean(C003 == 1)) %>%
ggplot(aes(DID, never_ratio, color = factor(RID))) +
geom_point()
child_ica %>%
group_by(HHID) %>%
mutate(n_children = length(unique(CID))) %>%
ungroup() %>%
group_by(DID) %>%
mutate(avg_n_children = mean(n_children)) %>%
ggplot(aes(DID, avg_n_children, color = factor(RID))) +
geom_point()
child_ica %>%
ggplot(aes(C005)) +
geom_histogram(stat = "count") +
coord_flip()
## Warning: Ignoring unknown parameters: binwidth, bins, pad
unique(child_ica$C005)
## [1] "1" "2" "" "4" "3" "Nursery" "6"
## [8] "7" "5" "8" "Kachi" "10" "9" "KG"
## [15] "PG" "Pakki" "11" "Prep" "ECE" "12" "Hafiz"
glm_child <- glm(C003_01~ C001 + C002_01 + PR004_01, family = "binomial", data = child_ica_dummy)
# glm_child <- glm(enroll01~ C001 + C002 + PR004 + PR009 +
# Panjab + Sindh + Balochistan + Khyber_Pakhtunkhwa +
# Gilgit_Baltistan + Azad_Jammu_and_Kashmir + Islamabad_ICT,
# family = "binomial", data = child_ica_dummy %>% filter(!is.na(C002)))
summary(glm_child)
##
## Call:
## glm(formula = C003_01 ~ C001 + C002_01 + PR004_01, family = "binomial",
## data = child_ica_dummy)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4716 -1.0691 0.6141 0.8525 1.4442
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.567479 0.012767 -44.45 <2e-16 ***
## C001 0.174066 0.001340 129.90 <2e-16 ***
## C002_01 -0.562952 0.009369 -60.09 <2e-16 ***
## PR004_01 0.788430 0.010620 74.24 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 299726 on 245746 degrees of freedom
## Residual deviance: 271929 on 245743 degrees of freedom
## AIC: 271937
##
## Number of Fisher Scoring iterations: 4
exp(glm_child$coefficients)
## (Intercept) C001 C002_01 PR004_01
## 0.5669529 1.1901343 0.5695255 2.1999390
confint(glm_child, level = 0.95)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -0.5925094 -0.5424624
## C001 0.1714426 0.1766953
## C002_01 -0.5813180 -0.5445920
## PR004_01 0.7676346 0.8092649
exp(confint(glm_child, level = 0.95))
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 0.5529380 0.5813151
## C001 1.1870160 1.1932675
## C002_01 0.5591609 0.5800784
## PR004_01 2.1546637 2.2462561
extractAIC(glm_child)
## [1] 4.0 271937.4
extractAIC(glm_child, k = log(nrow(glm_child$data)))
## [1] 4.0 271979.1
glm_child_null <- glm(C003_01~1, family = "binomial", data = child_ica_dummy)
anova(glm_child_null, glm_child, test = "Chisq")
step(glm_child_null, direction = "both",
scope = (~ C001 + C002_01 + PR004_01 + PR009_01))
## Start: AIC=299727.8
## C003_01 ~ 1
##
## Df Deviance AIC
## + C001 1 281139 281143
## + PR009_01 1 293719 293723
## + PR004_01 1 295038 295042
## + C002_01 1 295943 295947
## <none> 299726 299728
##
## Step: AIC=281142.5
## C003_01 ~ C001
##
## Df Deviance AIC
## + PR009_01 1 274422 274428
## + PR004_01 1 275560 275566
## + C002_01 1 277803 277809
## <none> 281139 281143
## - C001 1 299726 299728
##
## Step: AIC=274428
## C003_01 ~ C001 + PR009_01
##
## Df Deviance AIC
## + C002_01 1 270765 270773
## + PR004_01 1 272824 272832
## <none> 274422 274428
## - PR009_01 1 281139 281143
## - C001 1 293719 293723
##
## Step: AIC=270773.3
## C003_01 ~ C001 + PR009_01 + C002_01
##
## Df Deviance AIC
## + PR004_01 1 269072 269082
## <none> 270765 270773
## - C002_01 1 274422 274428
## - PR009_01 1 277803 277809
## - C001 1 289624 289630
##
## Step: AIC=269081.6
## C003_01 ~ C001 + PR009_01 + C002_01 + PR004_01
##
## Df Deviance AIC
## <none> 269072 269082
## - PR004_01 1 270765 270773
## - PR009_01 1 271929 271937
## - C002_01 1 272824 272832
## - C001 1 288269 288277
##
## Call: glm(formula = C003_01 ~ C001 + PR009_01 + C002_01 + PR004_01,
## family = "binomial", data = child_ica_dummy)
##
## Coefficients:
## (Intercept) C001 PR009_01 C002_01 PR004_01
## -0.7815 0.1760 0.5677 -0.5762 0.4923
##
## Degrees of Freedom: 245746 Total (i.e. Null); 245742 Residual
## Null Deviance: 299700
## Residual Deviance: 269100 AIC: 269100
# step(glm_child_null, direction = "both",
# scope = (~ C001 + C002 + PR004_01 + PR009_01 +
# Panjab + Sindh + Balochistan + Khyber_Pakhtunkhwa + Gilgit_Baltistan +
# Azad_Jammu_and_Kashmir + Islamabad_ICT))
vif(glm_child)
## C001 C002_01 PR004_01
## 1.010465 1.004067 1.013630
child_ica %>%
filter(PR009 == 0, !is.na(C008a)) %>%
summarize(n = length(unique(CID)),
paid_tuition = sum(C008a == -1),
ratio = paid_tuition/n)
child_ica %>%
filter(DID == 266, PR009 == 0, !is.na(C008a)) %>%
summarize(n = length(unique(CID)),
paid_tuition = sum(C008a == -1),
ratio = paid_tuition/n)
child_ica %>%
filter(DID == 266, PR004 == 0, !is.na(C008a)) %>%
summarize(n = length(unique(CID)),
paid_tuition = sum(C008a == -1),
ratio = paid_tuition/n)
child_ica %>%
filter(DID == 266, !is.na(PR009)) %>%
summarize(father_edu_ratio = sum(PR009 == -1)/length(PR009))
child_ica %>%
filter(!is.na(PR009)) %>%
summarize(father_edu_ratio = sum(PR009 == -1)/length(PR009))
child_ica %>%
filter(!is.na(PR009)) %>%
group_by(DID) %>%
mutate(father_edu_ratio = sum(PR009 == -1)/length(PR009)) %>%
ggplot(aes(factor(DID), father_edu_ratio)) +
geom_point()
child_ica %>%
filter(!is.na(PR009), !is.na(PR004)) %>%
group_by(DID) %>%
mutate(mother_edu_ratio = sum(PR004 == -1)/length(PR004)) %>%
ggplot(aes(factor(DID), mother_edu_ratio)) +
geom_point()
child_ica %>%
group_by(DID) %>%
mutate(avg_paid = mean(C008a == -1, na.rm = TRUE)) %>%
ggplot(aes(DID, avg_paid)) +
geom_point()+
scale_y_continuous(limits = c(0,1))
child_ica %>%
filter(!is.na(C002)) %>%
group_by(DID) %>%
summarize(avg_paid = mean(C008a == -1),
female = mean(child_ica %>% filter(C002 == -1) %>%
group_by(DID) %>% .$C003 == 3),
male = mean(child_ica %>% filter(C002 == 0) %>%
group_by(DID) %>% .$C003 == 3),
ratio = female/male)
## `summarise()` ungrouping output (override with `.groups` argument)
child_ica %>%
filter(!is.na(C002)) %>%
group_by(DID, C002) %>%
summarize(enroll = mean(C003 == 3)) %>%
ungroup() %>%
spread(C002, enroll) %>%
summarize(DID = DID,
gender_ratio = .$"-1"/.$"0") %>%
ggplot(aes(DID, gender_ratio)) +
geom_point() +
scale_y_continuous(limits = c(0,1))
## `summarise()` regrouping output by 'DID' (override with `.groups` argument)
## Warning: Removed 16 rows containing missing values (geom_point).
child_ica %>%
filter(!is.na(C002)) %>%
group_by(DID) %>%
mutate(paid_tuition = mean(C008a == -1, na.rm = TRUE)) %>%
group_by(DID, C002) %>%
mutate(enroll = mean(C003 == 3)) %>%
ggplot(aes(paid_tuition, enroll)) +
geom_point() +
geom_smooth() +
facet_grid(.~C002)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'